# Alexander F. Gazmararian
# agazmararian@gmail.com

# Name of the Excel workbook that is read from data/input/credit
INPUT_FILE <- "Statements_20250813.xlsx"
# Name of the CSV that the processed statements are written to in data/inter
OUTPUT_FILE <- "statements_processed.csv"
# NOTE(review): presumably the Inflation Reduction Act signing date; it is not
# referenced anywhere in the visible code -- confirm it is still needed
ira_date <- as.Date("2022-08-16")

library(tidyverse)
library(tidylog)
library(here)
library(readxl)
library(janitor)
library(lubridate)

# Load the raw statements workbook, standardize the column names, and coerce
# every column whose name contains "date" to Date
g <- here("data", "input", "credit", INPUT_FILE) %>%
  read_xlsx(progress = FALSE) %>%
  clean_names() %>%
  mutate(across(contains("date"), ~ as.Date(.x, format = "%Y/%m/%d")))

# Normalize the bracketed notes that RAs added to the raw entries
g <- g %>%
  mutate(
    # Flag entries that carry a bracketed note. These must be evaluated before
    # the brackets are stripped further down; evaluated afterwards,
    # bracket_representative could never be TRUE.
    bracket_company = str_detect(company, "\\[[^]]*\\]"),
    bracket_governor = str_detect(governor, "\\[[^]]*\\]"),
    bracket_representative = str_detect(representative, "\\[[^]]*\\]"),
    bracket_senator_1 = str_detect(senator_1, "\\[[^]]*\\]"),
    bracket_senator_2 = str_detect(senator_2, "\\[[^]]*\\]"),
    # "[NA accounted]" is recoded to the explicit label "Not announced"
    state = if_else(state == "[NA accounted]", "Not announced", state),
    district = if_else(district == "[NA accounted]", "Not announced", district),
    # Unwrap any remaining bracketed text, keeping what is inside the brackets
    district = gsub("\\[(.*?)\\]", "\\1", district),
    representative = gsub("\\[(.*?)\\]", "\\1", representative)
  )

# Drop every column whose name contains "bracket". This removes the flags
# created above, so they never reach the output.
# NOTE(review): delete this step if the flags are meant to be kept.
g <- g %>%
  select(-contains("bracket"))

# Treat the literal string "NA" in any character column as a missing value
g <- g %>%
  mutate(across(
    where(is.character),
    ~ na_if(.x, "NA")
  ))

# Reshape to one row per (record, actor) so each statement can be annotated
# on its own, then attach the speaker's name, role, and release type.
g_long <- g %>%
  pivot_longer(
    cols = all_of(c(
      "company_text", "governor_text", "senator_1_text",
      "senator_2_text", "representative_text", "biden_text"
    )),
    names_to = "actor",
    values_to = "statement"
  ) %>%
  mutate(
    # Name of whoever issued the statement; Biden has no name column
    speaker_name = case_when(
      actor == "biden_text" ~ "Joe Biden",
      actor == "representative_text" ~ representative,
      actor == "senator_2_text" ~ senator_2,
      actor == "senator_1_text" ~ senator_1,
      actor == "governor_text" ~ governor,
      actor == "company_text" ~ company
    ),
    # Fixed role label per actor, looked up by the actor column name
    speaker_role = unname(c(
      company_text = "Company",
      governor_text = "Governor",
      senator_1_text = "U.S. Senator",
      senator_2_text = "U.S. Senator",
      representative_text = "U.S. Representative",
      biden_text = "President"
    )[actor]),
    # Release type comes from each actor's *_category_2 column, except for
    # Biden, whose value is read from biden_text_category_1
    release_type = case_when(
      actor == "biden_text" ~ biden_text_category_1,
      actor == "representative_text" ~ representative_text_category_2,
      actor == "senator_2_text" ~ senator_2_text_category_2,
      actor == "senator_1_text" ~ senator_1_text_category_2,
      actor == "governor_text" ~ governor_text_category_2,
      actor == "company_text" ~ company_text_category_2
    )
  )

# Pre-process statements: strip annotations and URLs, normalize quotes,
# punctuation runs, and whitespace
processed <- g_long %>%
  mutate(
    # Remove bracketed notes and parenthetical asides
    statement = gsub("\\[[^]]*\\]", " ", statement),
    statement = gsub("\\([^)]*\\)", " ", statement),
    # Remove URLs
    statement = gsub("http[s]?://\\S+", " ", statement),
    # Convert curly quotes to straight quotes. The characters are written as
    # \u escapes so the patterns survive any re-encoding of this source file
    # (with literal straight quotes in the class these calls are no-ops).
    statement = gsub("[\u2018\u2019]", "'", statement),
    statement = gsub("[\u201C\u201D]", "\"", statement),
    # Collapse runs of sentence-ending punctuation into a single period
    statement = gsub("[!?.]{2,}", ".", statement),
    # Collapse whitespace runs and trim the ends
    statement = gsub("\\s+", " ", statement),
    statement = trimws(statement),
    statement = iconv(statement, to = "UTF-8")
  )

# Keep the original id column and add statement_id for annotation tracking
processed$statement_id <- seq_along(processed$statement)

# Verify date columns are correct type: report every column whose name
# contains "date" but that is not stored as a Date
date_cols <- grep("date", names(processed), value = TRUE)
for (col in date_cols) {
  if (!inherits(processed[[col]], "Date")) {
    # class() may return several classes (e.g. POSIXct and POSIXt); collapse
    # them so exactly one message line is emitted per column
    message(sprintf(
      "Warning: Column %s is not a Date object (class: %s)",
      col, paste(class(processed[[col]]), collapse = "/")
    ))
  }
}

# Clean the politician names

#' Extract a lower-cased last name from each full name
#'
#' Names containing a comma are treated as "Last, First": the trimmed text
#' before the first comma is returned. Otherwise the text after the last
#' space is returned.
#'
#' @param name Character vector of names.
#' @return Character vector the same length as `name`; `NA` where the name is
#'   missing, empty, or contains only whitespace.
extract_last_name <- function(name) {
  cleaned <- str_to_lower(str_trim(name))
  # Blank out empty names after trimming so whitespace-only input is NA too
  cleaned[!is.na(cleaned) & cleaned == ""] <- NA_character_

  # Default: everything after the last space (the whole string if no space)
  last_name <- str_extract(cleaned, "[^ ]*$")

  # "Last, First" entries: keep the part before the first comma instead
  has_comma <- !is.na(cleaned) & str_detect(cleaned, ",")
  last_name[has_comma] <- str_trim(str_extract(cleaned[has_comma], "^[^,]*"))

  last_name
}

# Derive a lower-cased last-name key for every politician column so the
# names can be merged on later
processed <- processed %>%
  mutate(
    gov_last = extract_last_name(governor),
    across(
      c(senator_1, senator_2, representative),
      extract_last_name,
      .names = "{.col}_last"
    )
  )

# Collapse known typos and name variants among governors to a single key;
# anything that matches none of the patterns is left as it is
processed$gov_last <- with(processed, case_when(
  str_detect(gov_last, "stitt") ~ "stitt",
  str_detect(gov_last, "abbottt") ~ "abbott",
  str_detect(gov_last, "huckabee sanders") ~ "sanders",
  TRUE ~ gov_last
))

# Numeric district for merging with the current representative: drop
# everything up to the last hyphen and convert the rest to a number.
# "Not announced" and anything else non-numeric become NA.
processed <- processed %>%
  mutate(
    district_clean = suppressWarnings(
      as.numeric(str_remove(na_if(district, "Not announced"), ".*-"))
    )
  )

# Harmonize the district names for merging with election data

#' Remove every match of the given regex patterns from district labels
#'
#' @param district_text Character vector of district labels.
#' @param ... Regular-expression patterns (character) to delete; they are
#'   combined into a single alternation.
#' @return `district_text` with all matches removed. When no patterns are
#'   supplied the input is returned unchanged.
clean_district_text <- function(district_text, ...) {
  removal_patterns <- c(...)
  # Nothing to remove: avoid building an empty regex
  if (length(removal_patterns) == 0) {
    return(district_text)
  }
  pattern_collapse <- paste0(removal_patterns, collapse = "|")
  str_remove_all(district_text, pattern_collapse)
}

# States whose district number is shifted down by one below.
# NOTE(review): MT has had two districts since the 2022 election -- confirm
# it should still be treated as single-district for election_year 2022.
single_district_states <- c("AK", "WY", "MT", "ND", "SD", "VT", "DE")

processed <- processed %>%
  mutate(
    # "Not announced" carries no district number
    district_num = na_if(district, "Not announced"),
    # Strip the state prefix (everything up to the last hyphen) and brackets
    district_num = as.numeric(
      clean_district_text(district_num, ".*-", "\\[", "\\]")
    ),
    # Single-district states: subtract one from the district number
    # (presumably so an at-large seat matches the election data's coding --
    # TODO confirm)
    district_num = if_else(
      state %in% single_district_states,
      district_num - 1,
      district_num
    ),
    # Dates up to and including 2022-11-08 map to 2020, later ones to 2022.
    # NOTE(review): rows with a missing date also receive 2022 -- confirm
    # that this is intended.
    election_year = if_else(
      date <= as.Date("2022-11-08"),
      2020,
      2022,
      missing = 2022
    )
  )

# Persist the processed statements to data/inter and report where they went
processed %>%
  write_csv(here("data", "inter", OUTPUT_FILE))
message("[OK] Saved ", here("data", "inter", OUTPUT_FILE))